#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible server for Qwen3-235B-A22B (INT4, pack8)
# on 8 GPUs (4-way tensor parallel x 2-way pipeline parallel).
set -euo pipefail

# Generous timeouts so long engine iterations / RPC calls are not killed.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=36000   # seconds
export VLLM_RPC_TIMEOUT=36000000               # milliseconds (frontend <-> engine RPC)
# NOTE(review): the two vars below are not standard upstream-vLLM env vars —
# presumably honored by a vendor/fork build; confirm they take effect.
export VLLM_ENFORCE_CUDA_GRAPH=1
export VLLM_W8A8_MOE_USE_W4A8=1

# exec replaces the shell so vllm receives SIGTERM/SIGINT directly
# (important under container supervisors for clean shutdown).
exec vllm serve /home/weights/Qwen3-235B-A22B-int4-pack8 \
  --served-model-name "qwen3-235" \
  --tensor-parallel-size 4 \
  --pipeline-parallel-size 2 \
  --enable-reasoning \
  --reasoning-parser qwen3 \
  --enable-chunked-prefill \
  --enable-prefix-caching \
  --disable-log-stats \
  --disable-log-requests \
  --host 0.0.0.0 --port 8000


# vllm serve /home/weights/Qwen3-235B-A22B-int4-pack8 \
# --served_model_name "qwen3-235" \
# -tp 4 -pp 4 \
# --enable-reasoning \
# --reasoning-parser qwen3 \
# --enable_chunked_prefill \
# --enable-prefix-caching \
# --disable_log_stats \
# --disable_log_requests \
# --host 0.0.0.0 --port 8000



# --max-model-len $((128*1024)) \
# --hf-overrides '{"rope_scaling": {"rope_type": "yarn", "factor": 4.0, "original_max_position_embeddings": 32768}}'
